{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "0_656arexQh0"
   },
   "source": [
    "# Advanced RAG: Context Enrichment Window\n",
    "\n",
    "Vanilla RAG is great, but some data, such as conversation history, needs smaller chunks because larger ones add unnecessary noise. Chunking at the level of a single exchange (a couple of messages) can work, but important context from previous or later replies might be lost. Bigger chunks could help, but they come with their own issues, like more noise and a limited number of chunks that can be used. The solution: **Context Enrichment**.\n",
    "\n",
    "Let's see how it can be done.\n",
    "\n",
    "\n",
    "![image.png]()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "HOUJKmlTvqDo",
    "outputId": "41544b27-d652-4f4d-e57d-4cd94bc5459d"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.7/41.7 kB\u001b[0m \u001b[31m2.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m34.8/34.8 MB\u001b[0m \u001b[31m57.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m470.2/470.2 kB\u001b[0m \u001b[31m29.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m11.2/11.2 MB\u001b[0m \u001b[31m75.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.1/4.1 MB\u001b[0m \u001b[31m38.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.1/53.1 kB\u001b[0m \u001b[31m4.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m24.1/24.1 MB\u001b[0m \u001b[31m14.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m363.4/363.4 MB\u001b[0m \u001b[31m4.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.8/13.8 MB\u001b[0m \u001b[31m51.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m24.6/24.6 MB\u001b[0m \u001b[31m32.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m883.7/883.7 kB\u001b[0m \u001b[31m61.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m664.8/664.8 MB\u001b[0m \u001b[31m2.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m211.5/211.5 MB\u001b[0m \u001b[31m5.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.3/56.3 MB\u001b[0m \u001b[31m15.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m127.9/127.9 MB\u001b[0m \u001b[31m7.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m207.5/207.5 MB\u001b[0m \u001b[31m5.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.1/21.1 MB\u001b[0m \u001b[31m122.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25h"
     ]
    }
   ],
   "source": [
    "# Install dependencies. %pip (unlike `! pip`) always installs into the environment of the running kernel.\n",
    "%pip install -U openai lancedb einops sentence-transformers transformers datasets tantivy rerankers langchain PyMuPDF -qq"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "gsZOASUOyMQa"
   },
   "source": [
    "## Download data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "7M-epvcmvWOw",
    "outputId": "f6d7ec05-256d-4609-c3cc-a1513737194f"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mkdir: cannot create directory ‘./data’: File exists\n",
      "--2025-07-29 18:27:18--  https://ncert.nic.in/textbook/pdf/jess301.pdf\n",
      "Resolving ncert.nic.in (ncert.nic.in)... 164.100.166.133\n",
      "Connecting to ncert.nic.in (ncert.nic.in)|164.100.166.133|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 5639921 (5.4M) [application/pdf]\n",
      "Saving to: ‘./data/history_chapter.pdf’\n",
      "\n",
      "./data/history_chap 100%[===================>]   5.38M  2.43MB/s    in 2.2s    \n",
      "\n",
      "2025-07-29 18:27:21 (2.43 MB/s) - ‘./data/history_chapter.pdf’ saved [5639921/5639921]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Get a PDF for example\n",
    "# -p: do not fail when ./data already exists, so this cell can be re-run\n",
    "!mkdir -p ./data\n",
    "!wget https://ncert.nic.in/textbook/pdf/jess301.pdf -O ./data/history_chapter.pdf"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "f5QiE5mJySSc"
   },
   "source": [
    "## Table creation and data ingestion"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 535,
     "referenced_widgets": [
      "82207a83f49c4f98ab6fc5809675ada6",
      "1749a83edbb7400a9fca15da4c1c17da",
      "7d14bc2e580d4e1884d36c3e1a864d86",
      "f5fb97971bec45a59b6b01722f56f640",
      "8232b4e365744ea7a66f2a7dd8a37221",
      "3e54ec9b538644489d566a6af9e58ae0",
      "fe77a416747840b8b642685340072451",
      "88274132a17a4552860a4f7f99f80bae",
      "affd89f113c044c8b57c85c6da5e9df6",
      "8ac64e48cbda4cf0b487009d8226ff7e",
      "1370cbbc3a254affb8f0fa1088ce757d",
      "28df33facb374853b3de30be3d856b5d",
      "88419bf3bd0245da87c426e38fe5b4fb",
      "d65fb1cfd4bf4661ad60f7b23e532223",
      "ea480634baee43a9835b9b85292ad3a0",
      "570917dafb5f4b30a327fdf58ad00a2f",
      "718a799d42394bcf80c302d4b6000c05",
      "b9dc02358c644c6ca8dd78a3f8f8f316",
      "b6f2fd3bfa7647cbbf7626128d3d513d",
      "fbdcfc358f6742a4a0277ccf0a0efb76",
      "8190d9206e7a4adeb989bd55754ebde1",
      "aee8812da41941aeba11b0a21d5358dd",
      "598d7eab090641429b4b2147b5a08ca7",
      "2dde84554ed54376885b68a13310cd71",
      "14631ede8f1c46639cffb7751cbf4862",
      "28c97b808e6a4a69897431e173c1e72c",
      "7007b58e2f274b6d8e7e622502ef3b44",
      "023d8b7061b649d5bdd029614363958d",
      "b4517150a3df45818252519111419980",
      "29dd7a22cd1049308b7f55b40ce0a976",
      "c7d0c161a95349679740e8f0218a127d",
      "58ef4af474c1465293f31c091f9c711f",
      "a9f84a8912e549b4ab97c2702fdf875f",
      "2f3754968a434a2dae5d659f4bd9cc10",
      "062cb111090e44df99d877374e6e2f54",
      "d770780da6b0465a94c2a9f6e296eef5",
      "515bd7ff842f4837924636cb5c1750f0",
      "b62915473529427888a9ff8a3c55ba23",
      "9ca750ae7f80401eb37bd0d2281346ee",
      "b3d5f71ae71f4ebb8feaf6001b0721cb",
      "d56603515c5e482596071382b412ab7c",
      "86b2a0cb1d0d425494d3f716de52cb4c",
      "3fa02a5a533244ee95159419ae4a3084",
      "c5f1cdbcdefd44d6b040976debc48f38",
      "bc83bbc97dd94228b7d5d5683b22726c",
      "ffa40b8089c14261abde105eb534102d",
      "b64f2613dc864afe8be048607a6bbcdb",
      "5bcfa2e6ba8a41abb5c203087b2fa91b",
      "01e30ad4276b40ee8a5748d745af8a85",
      "183585a0f33c4a5697c5643ea76c9804",
      "111c194af36f4ef5bb379f8e53a4b2f1",
      "a0e90324c5364329b88551cfda90187d",
      "cc9296525a184d8f83d47c40d6099c40",
      "2e5b2acd276a46bf926c787444f4f1c1",
      "f24ae4ba5dd6424290412cedd04403e3",
      "66a1aaec3ab240e1822f8f3ac266ae9d",
      "49f0fda7b3ab454fb6b7f777f062fdc4",
      "1ffd485fe4a740fd99eb7eb7c7b321ec",
      "69654c96e3d241eeaf4c1bbddc37d0b5",
      "07b83ed1ed22460ea66d9c175abdf6b0",
      "c3839674f1a5474995030d78c1001f52",
      "fdbfc731a9dc4af4a9736270d403116a",
      "4558bd6233a54ba5b68f4e6b94c09632",
      "95d01b75c7134dcdab600e448f3fa413",
      "bcf4ba491d574a14b85314c001ddedba",
      "52d2f3c6f3b6450eba0bf8e0b16c8a9d",
      "c52e4d649e5449b988814231eac776df",
      "99fabb24f4274edb85a838f5e8975d4d",
      "2bc67790214741bcabd71d6993581c15",
      "f4dd22b374414d8cb06a21bb9ed096af",
      "532e4df3f25d427593bfb1d7c04a66c1",
      "2683d39ca809446cb383760dc2b17b8e",
      "efc59ccd8ca44bf88effeb842962e4de",
      "02c0b0ec5c504cc0bbe3786f2a3e73f7",
      "7db1fb181e384c099b420298ef436f32",
      "ecdce120b00f444091c61820e9563420",
      "8b37109edcee482885ccf9bd4009a886",
      "cd1cd2552e6c40e1b392283c41c118ae",
      "b2988f8a557d4993b99ca31683bd6bb0",
      "8dd226660ee34032841a2a9f96e019ff",
      "f4f5590c03c34c22849e7db9e9a0b068",
      "e4db45a0b040410e98da6aaee4f45c62",
      "5c93f388644a4eae8dcba2194bf8fffd",
      "fe8e9e43c78e4b9ba3066cf504f2866b",
      "577e1b4d802148029f899a88fab2edbb",
      "cf94535647d048979df34ab20ac0021e",
      "595d804f347d4b458bc9bed8e5ab9b0d",
      "f98cb06ea20c49ef93c3c93e2300b643",
      "8c64c11afc5449649b5d9674f5cd2420",
      "dcd81496ab7c4cec855d9ee1f2fa6b9a",
      "e31699e9bb754cea94570c17c498748e",
      "ca1a1ef319094704bce47d265cd722b2",
      "e5b614447aa7470cad79ae6e9fe60f44",
      "0a9367fa39ac43579114e98c8b4e7379",
      "44a25ba27ac341328f3fab5b7579b068",
      "073eae8463994c5f8ad4ce61c608ad32",
      "01768efe3b084412acd0b4b8a0b6a625",
      "a9b99a37474e4616a5bcbc2c38727436",
      "f8ef62c833b74b89bba4cc6cdff97097",
      "8c4ada30953e40259c0e4081b358f273",
      "e11a2029c51e48569d99ccda3254aa7d",
      "af436e74af774ae8900c8a9beed81b66",
      "15fcd68650764a42a8d83dae9b028917",
      "e85ac0ac7e514e7c941871378be0e0e7",
      "a0a724532c424b838993657fbadf5a7d",
      "2e6fe24b294b475b9ca314a5b87ffb01",
      "c4847e7f62ee40b28391f425d1a13965",
      "ca80d5826ec74124a4c002f318a735f0",
      "239c27cd321142d98acd48330a878400",
      "0bd989d4745249f2abcd09d8511824a3",
      "9793af2f1b9d459098a340981378bb2c",
      "6b23d8f095354c449d961a6170914c6d",
      "91b48d0b6fdd4f0c932d5c4e936e5e10",
      "ec970d3a85014412a060374a8d126d80",
      "b4c559cb92114f148704207e60fec160",
      "acab98eb07cb478eb3f347f5d9b589fa",
      "e2a64d831b254a879931b6e9db003b7e",
      "4df39cd371e44fcc82075f05f61b79d4",
      "85aea18c3f164dab8f6584844d17f0af",
      "1786e7bd7aa14940ac6389bb34919faa",
      "421fe8ebb1d5498db894e946c08d5d4b"
     ]
    },
    "id": "SsWMgv9fvxvL",
    "outputId": "d82ba33c-d4eb-459f-c9ec-ef1df3a59b00"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n",
      "The secret `HF_TOKEN` does not exist in your Colab secrets.\n",
      "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n",
      "You will be able to reuse this secret in all of your notebooks.\n",
      "Please note that authentication is recommended but still optional to access public models or datasets.\n",
      "  warnings.warn(\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "82207a83f49c4f98ab6fc5809675ada6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "28df33facb374853b3de30be3d856b5d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "598d7eab090641429b4b2147b5a08ca7",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "README.md: 0.00B [00:00, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2f3754968a434a2dae5d659f4bd9cc10",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "bc83bbc97dd94228b7d5d5683b22726c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "66a1aaec3ab240e1822f8f3ac266ae9d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c52e4d649e5449b988814231eac776df",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "cd1cd2552e6c40e1b392283c41c118ae",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "vocab.txt: 0.00B [00:00, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "8c64c11afc5449649b5d9674f5cd2420",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "tokenizer.json: 0.00B [00:00, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "8c4ada30953e40259c0e4081b358f273",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "9793af2f1b9d459098a340981378bb2c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py:1750: FutureWarning: `encoder_attention_mask` is deprecated and will be removed in version 4.55.0 for `BertSdpaSelfAttention.forward`.\n",
      "  return forward_call(*args, **kwargs)\n"
     ]
    }
   ],
   "source": [
    "# Import Libraries\n",
    "import os, re, random, json\n",
    "import pandas as pd\n",
    "import torch\n",
    "import lancedb\n",
    "from lancedb.embeddings import get_registry\n",
    "from lancedb.pydantic import LanceModel, Vector\n",
    "from tqdm.auto import tqdm\n",
    "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
    "from langchain.docstore.document import Document\n",
    "import fitz\n",
    "from typing import List\n",
    "\n",
    "# Let pandas show up to 750 characters per cell so chunk texts are readable\n",
    "pd.set_option(\"max_colwidth\", 750)\n",
    "\n",
    "# Embedding model: BAAI/bge-small-en-v1.5 from LanceDB's \"sentence-transformers\"\n",
    "# registry entry, placed on the GPU when CUDA is available and on the CPU otherwise\n",
    "model = get_registry().get(\"sentence-transformers\").create(\n",
    "    name=\"BAAI/bge-small-en-v1.5\",\n",
    "    device=\"cuda\" if torch.cuda.is_available() else \"cpu\",\n",
    ")\n",
    "\n",
    "\n",
    "def read_pdf_to_string(path):\n",
    "    \"\"\"\n",
    "    Read a PDF document and return the text of all its pages as one string.\n",
    "\n",
    "    Args:\n",
    "        path (str): The file path to the PDF document.\n",
    "\n",
    "    Returns:\n",
    "        str: The text of every page, concatenated in page order.\n",
    "\n",
    "    The document is opened with the 'fitz' library (PyMuPDF) inside a `with` block,\n",
    "    so it is closed again even if text extraction raises.\n",
    "    \"\"\"\n",
    "    with fitz.open(path) as doc:\n",
    "        # Extract the text page by page and join once instead of repeated `+=`\n",
    "        return \"\".join(page.get_text() for page in doc)\n",
    "\n",
    "\n",
    "def split_text_to_chunks_with_indices(\n",
    "    text: str, chunk_size: int, chunk_overlap: int\n",
    ") -> List[Document]:\n",
    "    \"\"\"\n",
    "    Split `text` into fixed-size character windows and tag each with its position.\n",
    "\n",
    "    Args:\n",
    "        text (str): The text to split.\n",
    "        chunk_size (int): Number of characters per chunk; must be positive.\n",
    "        chunk_overlap (int): Number of characters shared by consecutive chunks;\n",
    "            must be smaller than `chunk_size`.\n",
    "\n",
    "    Returns:\n",
    "        List[Document]: One Document per window. `metadata[\"index\"]` is the\n",
    "        0-based position of the chunk and `metadata[\"text\"]` is the full input text.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If `chunk_size` is not positive, or `chunk_overlap` is not\n",
    "            smaller than `chunk_size`.\n",
    "    \"\"\"\n",
    "    if chunk_size <= 0:\n",
    "        raise ValueError(\"chunk_size must be a positive integer\")\n",
    "    step = chunk_size - chunk_overlap\n",
    "    if step <= 0:\n",
    "        # A non-positive step would keep the window from ever reaching the end of the text\n",
    "        raise ValueError(\"chunk_overlap must be smaller than chunk_size\")\n",
    "\n",
    "    return [\n",
    "        Document(\n",
    "            page_content=text[start : start + chunk_size],\n",
    "            metadata={\"index\": index, \"text\": text},\n",
    "        )\n",
    "        for index, start in enumerate(range(0, len(text), step))\n",
    "    ]\n",
    "\n",
    "\n",
    "# Read the PDF from the same relative location the download cell saved it to\n",
    "# (the absolute \"/content/...\" form of this path only exists on Colab)\n",
    "content = read_pdf_to_string(\"./data/history_chapter.pdf\")\n",
    "\n",
    "# Chunking parameters, measured in characters (length_function=len below)\n",
    "CHUNK_SIZE = 512\n",
    "CHUNK_OVERLAP = 128\n",
    "\n",
    "text_splitter = RecursiveCharacterTextSplitter(\n",
    "    chunk_size=CHUNK_SIZE,\n",
    "    chunk_overlap=CHUNK_OVERLAP,\n",
    "    length_function=len,\n",
    "    is_separator_regex=False,\n",
    ")\n",
    "\n",
    "texts = text_splitter.create_documents([content])\n",
    "\n",
    "\n",
    "# Create the table\n",
    "\n",
    "\n",
    "class Schema(LanceModel):\n",
    "    # Source column: the field in the DB whose embedding gets created\n",
    "    text: str = model.SourceField()\n",
    "    # Position of the chunk within the document, assigned at ingestion time\n",
    "    chunk_index: int\n",
    "    # Default vector field, sized to the embedding model's output dimension\n",
    "    vector: Vector(model.ndims()) = model.VectorField()\n",
    "\n",
    "\n",
    "chunks = []\n",
    "for index, doc in enumerate(texts):\n",
    "    # chunk_index is 1-based and follows the order of the chunks in the document\n",
    "    chunks.append({\"text\": doc.page_content, \"chunk_index\": index + 1})\n",
    "\n",
    "# Highest chunk_index that exists in the DB; needed later to find the final chunk.\n",
    "# Taken from len(chunks) instead of the leftover loop variable, so it is still\n",
    "# defined (as 0) when the splitter produced no chunks.\n",
    "MAX_CHUNK_INDEX = len(chunks)\n",
    "\n",
    "db = lancedb.connect(\"./db\")\n",
    "# mode=\"overwrite\" lets this cell be re-run without failing because the table already exists\n",
    "table = db.create_table(\"documents\", schema=Schema, mode=\"overwrite\")\n",
    "\n",
    "table.add(chunks)  # ingest docs with auto-vectorization\n",
    "# Create an FTS index up front so that BM-25 / hybrid search can be used later;\n",
    "# replace=True keeps a re-run from failing on an index left over from a previous run\n",
    "table.create_fts_index(\"text\", replace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "NhnST-5DypFI"
   },
   "source": [
    "So we have created our table where each text chunk has an index associated with it. Let's now do a simple search."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 247
    },
    "id": "WCAj0aS6v4Jn",
    "outputId": "614d31ab-5923-4127-8d60-9f24235453ea"
   },
   "outputs": [
    {
     "data": {
      "application/vnd.google.colaboratory.intrinsic+json": {
       "summary": "{\n  \"name\": \"initial_results\",\n  \"rows\": 3,\n  \"fields\": [\n    {\n      \"column\": \"text\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 3,\n        \"samples\": [\n          \"the former royal standard. The Estates General  was elected by the\\nbody of active citizens and renamed the National Assembly. New\\nhymns were composed, oaths taken and martyrs commemorated,\\nall in the name of the nation. A centralised administrative system\\nwas put in place and it formulated uniform laws for all citizens\\nwithin its territory. Internal customs duties and dues were abolished\\nand a uniform system of weights and measures was adopted.\",\n          \"should be preserved. Most conservatives, however, did not propose\\na return to the society of pre-revolutionary days. Rather, they realised,\\nfrom the changes initiated by Napoleon, that modernisation could\\nin fact strengthen traditional institutions like the monarchy. It could\\nmake  state  power more effective and strong. A modern army, an\\nefficient bureaucracy, a dynamic economy, the abolition of feudalism\\nand serfdom could strengthen the autocratic monarchies of Europe.\",\n          \"which a centralised power exercised sovereign control over a clearly\\ndefined territory, had been developing over a long period of time\\nin Europe. But a nation-state was one in which the majority of its\\ncitizens, and not only its rulers, came to develop a sense of common\\nidentity and shared history or descent. This commonness did not\\nexist from time immemorial; it was forged through struggles, through\\nthe actions of leaders and the common people. 
This chapter will\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"chunk_index\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 21,\n        \"min\": 8,\n        \"max\": 49,\n        \"num_unique_values\": 3,\n        \"samples\": [\n          16,\n          49,\n          8\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"_distance\",\n      \"properties\": {\n        \"dtype\": \"float32\",\n        \"num_unique_values\": 3,\n        \"samples\": [\n          0.6005803346633911,\n          0.6166247129440308,\n          0.6333437561988831\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}",
       "type": "dataframe"
      },
      "text/html": [
       "\n",
       "  <div id=\"df-5901ab47-4e22-43f3-86ae-4bde87cc3072\" class=\"colab-df-container\">\n",
       "    <div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>chunk_index</th>\n",
       "      <th>_distance</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>the former royal standard. The Estates General  was elected by the\\nbody of active citizens and renamed the National Assembly. New\\nhymns were composed, oaths taken and martyrs commemorated,\\nall in the name of the nation. A centralised administrative system\\nwas put in place and it formulated uniform laws for all citizens\\nwithin its territory. Internal customs duties and dues were abolished\\nand a uniform system of weights and measures was adopted.</td>\n",
       "      <td>16</td>\n",
       "      <td>0.600580</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>should be preserved. Most conservatives, however, did not propose\\na return to the society of pre-revolutionary days. Rather, they realised,\\nfrom the changes initiated by Napoleon, that modernisation could\\nin fact strengthen traditional institutions like the monarchy. It could\\nmake  state  power more effective and strong. A modern army, an\\nefficient bureaucracy, a dynamic economy, the abolition of feudalism\\nand serfdom could strengthen the autocratic monarchies of Europe.</td>\n",
       "      <td>49</td>\n",
       "      <td>0.616625</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>which a centralised power exercised sovereign control over a clearly\\ndefined territory, had been developing over a long period of time\\nin Europe. But a nation-state was one in which the majority of its\\ncitizens, and not only its rulers, came to develop a sense of common\\nidentity and shared history or descent. This commonness did not\\nexist from time immemorial; it was forged through struggles, through\\nthe actions of leaders and the common people. This chapter will</td>\n",
       "      <td>8</td>\n",
       "      <td>0.633344</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>\n",
       "    <div class=\"colab-df-buttons\">\n",
       "\n",
       "  <div class=\"colab-df-container\">\n",
       "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-5901ab47-4e22-43f3-86ae-4bde87cc3072')\"\n",
       "            title=\"Convert this dataframe to an interactive table.\"\n",
       "            style=\"display:none;\">\n",
       "\n",
       "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
       "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
       "  </svg>\n",
       "    </button>\n",
       "\n",
       "  <style>\n",
       "    .colab-df-container {\n",
       "      display:flex;\n",
       "      gap: 12px;\n",
       "    }\n",
       "\n",
       "    .colab-df-convert {\n",
       "      background-color: #E8F0FE;\n",
       "      border: none;\n",
       "      border-radius: 50%;\n",
       "      cursor: pointer;\n",
       "      display: none;\n",
       "      fill: #1967D2;\n",
       "      height: 32px;\n",
       "      padding: 0 0 0 0;\n",
       "      width: 32px;\n",
       "    }\n",
       "\n",
       "    .colab-df-convert:hover {\n",
       "      background-color: #E2EBFA;\n",
       "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
       "      fill: #174EA6;\n",
       "    }\n",
       "\n",
       "    .colab-df-buttons div {\n",
       "      margin-bottom: 4px;\n",
       "    }\n",
       "\n",
       "    [theme=dark] .colab-df-convert {\n",
       "      background-color: #3B4455;\n",
       "      fill: #D2E3FC;\n",
       "    }\n",
       "\n",
       "    [theme=dark] .colab-df-convert:hover {\n",
       "      background-color: #434B5C;\n",
       "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
       "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
       "      fill: #FFFFFF;\n",
       "    }\n",
       "  </style>\n",
       "\n",
       "    <script>\n",
       "      const buttonEl =\n",
       "        document.querySelector('#df-5901ab47-4e22-43f3-86ae-4bde87cc3072 button.colab-df-convert');\n",
       "      buttonEl.style.display =\n",
       "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
       "\n",
       "      async function convertToInteractive(key) {\n",
       "        const element = document.querySelector('#df-5901ab47-4e22-43f3-86ae-4bde87cc3072');\n",
       "        const dataTable =\n",
       "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
       "                                                    [key], {});\n",
       "        if (!dataTable) return;\n",
       "\n",
       "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
       "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
       "          + ' to learn more about interactive tables.';\n",
       "        element.innerHTML = '';\n",
       "        dataTable['output_type'] = 'display_data';\n",
       "        await google.colab.output.renderOutput(dataTable, element);\n",
       "        const docLink = document.createElement('div');\n",
       "        docLink.innerHTML = docLinkHtml;\n",
       "        element.appendChild(docLink);\n",
       "      }\n",
       "    </script>\n",
       "  </div>\n",
       "\n",
       "\n",
       "    <div id=\"df-3093c6ec-7d0d-4362-86bc-f91db8a900eb\">\n",
       "      <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-3093c6ec-7d0d-4362-86bc-f91db8a900eb')\"\n",
       "                title=\"Suggest charts\"\n",
       "                style=\"display:none;\">\n",
       "\n",
       "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
       "     width=\"24px\">\n",
       "    <g>\n",
       "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
       "    </g>\n",
       "</svg>\n",
       "      </button>\n",
       "\n",
       "<style>\n",
       "  .colab-df-quickchart {\n",
       "      --bg-color: #E8F0FE;\n",
       "      --fill-color: #1967D2;\n",
       "      --hover-bg-color: #E2EBFA;\n",
       "      --hover-fill-color: #174EA6;\n",
       "      --disabled-fill-color: #AAA;\n",
       "      --disabled-bg-color: #DDD;\n",
       "  }\n",
       "\n",
       "  [theme=dark] .colab-df-quickchart {\n",
       "      --bg-color: #3B4455;\n",
       "      --fill-color: #D2E3FC;\n",
       "      --hover-bg-color: #434B5C;\n",
       "      --hover-fill-color: #FFFFFF;\n",
       "      --disabled-bg-color: #3B4455;\n",
       "      --disabled-fill-color: #666;\n",
       "  }\n",
       "\n",
       "  .colab-df-quickchart {\n",
       "    background-color: var(--bg-color);\n",
       "    border: none;\n",
       "    border-radius: 50%;\n",
       "    cursor: pointer;\n",
       "    display: none;\n",
       "    fill: var(--fill-color);\n",
       "    height: 32px;\n",
       "    padding: 0;\n",
       "    width: 32px;\n",
       "  }\n",
       "\n",
       "  .colab-df-quickchart:hover {\n",
       "    background-color: var(--hover-bg-color);\n",
       "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
       "    fill: var(--button-hover-fill-color);\n",
       "  }\n",
       "\n",
       "  .colab-df-quickchart-complete:disabled,\n",
       "  .colab-df-quickchart-complete:disabled:hover {\n",
       "    background-color: var(--disabled-bg-color);\n",
       "    fill: var(--disabled-fill-color);\n",
       "    box-shadow: none;\n",
       "  }\n",
       "\n",
       "  .colab-df-spinner {\n",
       "    border: 2px solid var(--fill-color);\n",
       "    border-color: transparent;\n",
       "    border-bottom-color: var(--fill-color);\n",
       "    animation:\n",
       "      spin 1s steps(1) infinite;\n",
       "  }\n",
       "\n",
       "  @keyframes spin {\n",
       "    0% {\n",
       "      border-color: transparent;\n",
       "      border-bottom-color: var(--fill-color);\n",
       "      border-left-color: var(--fill-color);\n",
       "    }\n",
       "    20% {\n",
       "      border-color: transparent;\n",
       "      border-left-color: var(--fill-color);\n",
       "      border-top-color: var(--fill-color);\n",
       "    }\n",
       "    30% {\n",
       "      border-color: transparent;\n",
       "      border-left-color: var(--fill-color);\n",
       "      border-top-color: var(--fill-color);\n",
       "      border-right-color: var(--fill-color);\n",
       "    }\n",
       "    40% {\n",
       "      border-color: transparent;\n",
       "      border-right-color: var(--fill-color);\n",
       "      border-top-color: var(--fill-color);\n",
       "    }\n",
       "    60% {\n",
       "      border-color: transparent;\n",
       "      border-right-color: var(--fill-color);\n",
       "    }\n",
       "    80% {\n",
       "      border-color: transparent;\n",
       "      border-right-color: var(--fill-color);\n",
       "      border-bottom-color: var(--fill-color);\n",
       "    }\n",
       "    90% {\n",
       "      border-color: transparent;\n",
       "      border-bottom-color: var(--fill-color);\n",
       "    }\n",
       "  }\n",
       "</style>\n",
       "\n",
       "      <script>\n",
       "        async function quickchart(key) {\n",
       "          const quickchartButtonEl =\n",
       "            document.querySelector('#' + key + ' button');\n",
       "          quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
       "          quickchartButtonEl.classList.add('colab-df-spinner');\n",
       "          try {\n",
       "            const charts = await google.colab.kernel.invokeFunction(\n",
       "                'suggestCharts', [key], {});\n",
       "          } catch (error) {\n",
       "            console.error('Error during call to suggestCharts:', error);\n",
       "          }\n",
       "          quickchartButtonEl.classList.remove('colab-df-spinner');\n",
       "          quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
       "        }\n",
       "        (() => {\n",
       "          let quickchartButtonEl =\n",
       "            document.querySelector('#df-3093c6ec-7d0d-4362-86bc-f91db8a900eb button');\n",
       "          quickchartButtonEl.style.display =\n",
       "            google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
       "        })();\n",
       "      </script>\n",
       "    </div>\n",
       "\n",
       "    </div>\n",
       "  </div>\n"
      ],
      "text/plain": [
       "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                text  \\\n",
       "0                             the former royal standard. The Estates General  was elected by the\\nbody of active citizens and renamed the National Assembly. New\\nhymns were composed, oaths taken and martyrs commemorated,\\nall in the name of the nation. A centralised administrative system\\nwas put in place and it formulated uniform laws for all citizens\\nwithin its territory. Internal customs duties and dues were abolished\\nand a uniform system of weights and measures was adopted.   \n",
       "1  should be preserved. Most conservatives, however, did not propose\\na return to the society of pre-revolutionary days. Rather, they realised,\\nfrom the changes initiated by Napoleon, that modernisation could\\nin fact strengthen traditional institutions like the monarchy. It could\\nmake  state  power more effective and strong. A modern army, an\\nefficient bureaucracy, a dynamic economy, the abolition of feudalism\\nand serfdom could strengthen the autocratic monarchies of Europe.   \n",
       "2          which a centralised power exercised sovereign control over a clearly\\ndefined territory, had been developing over a long period of time\\nin Europe. But a nation-state was one in which the majority of its\\ncitizens, and not only its rulers, came to develop a sense of common\\nidentity and shared history or descent. This commonness did not\\nexist from time immemorial; it was forged through struggles, through\\nthe actions of leaders and the common people. This chapter will   \n",
       "\n",
       "   chunk_index  _distance  \n",
       "0           16   0.600580  \n",
       "1           49   0.616625  \n",
       "2            8   0.633344  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "TOP_K = 3  # How many similar chunks to retrieve\n",
    "NEIGHBOUR_WINDOW = 1  # 1 means 1 before and 1 after\n",
    "\n",
    "QUERY = \"What did the the revolution proclaim and what did the centralised administrative system do?\"\n",
    "\n",
    "\n",
    "initial_results = table.search(QUERY).limit(\n",
    "    TOP_K\n",
    ")  # Get all the similar chunks which are sorted by distance by default\n",
    "\n",
    "initial_results.to_pandas().drop(\"vector\", axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "lrFLbezazHQV"
   },
   "source": [
    "The important chunks from the query are 14, 86, and 16.\n",
    "\n",
    "Using a ```NEIGHBOUR_WINDOW=1```, we get the following chunk IDs: 13, 14, 15; 85, 86, 87; and 15, 16, 17.\n",
    "\n",
    "Notice ***chunk ID 15 appears in two groups***. It makes sense to associate it with the higher priority group (14), which has the minimum distance. Now, let's write the code to get the neighbors."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "X9Hpqbohv5Bo",
    "outputId": "4f35ce0a-7a9e-4d56-e870-8d12f2d93417"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{16: 0.6005803346633911,\n",
       " 15: 0.6005803346633911,\n",
       " 17: 0.6005803346633911,\n",
       " 49: 0.6166247129440308,\n",
       " 48: 0.6166247129440308,\n",
       " 50: 0.6166247129440308,\n",
       " 8: 0.6333437561988831,\n",
       " 7: 0.6333437561988831,\n",
       " 9: 0.6333437561988831}"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "similar_chunk_indices = {}  # store previous and next neighbour chunk\n",
    "\n",
    "for (\n",
    "    i\n",
    ") in (\n",
    "    initial_results.to_list()\n",
    "):  # Get all the similar chunks and their neighbour indices\n",
    "    index = i[\"chunk_index\"]\n",
    "    similar_chunk_indices[index] = i[\"_distance\"]\n",
    "\n",
    "    for near in range(1, NEIGHBOUR_WINDOW + 1):\n",
    "        if (max(0, index - near)) not in similar_chunk_indices:  # Previous neighbour\n",
    "            similar_chunk_indices[(max(0, index - near))] = i[\n",
    "                \"_distance\"\n",
    "            ]  # This chunk will also have the same distance\n",
    "\n",
    "        if (\n",
    "            min(index + near, MAX_CHUNK_INDEX) not in similar_chunk_indices\n",
    "        ):  # Next neighbour\n",
    "            similar_chunk_indices[min(index + near, MAX_CHUNK_INDEX)] = i[\"_distance\"]\n",
    "\n",
    "similar_chunk_indices  # Look at the index 15. It is a part of 14 and 16 both"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "CO3MlAMTzmjy"
   },
   "source": [
    "Now let's group and rerank these chunks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "bjxbgtTewCD2",
    "outputId": "b2670436-db69-42fe-9acd-a54988c73434"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{0: [15, 16, 17], 1: [48, 49, 50], 2: [7, 8, 9]}"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def group_and_rerank_chunks(indices_dict: dict):\n",
    "    \"\"\"\n",
    "    function to take the {\"chunk_index\":\"distance\"} dict and return {\"priority\": indices_group_list} dict\n",
    "    \"\"\"\n",
    "\n",
    "    sorted_indices = sorted(indices_dict.keys())  # Sort the indices\n",
    "\n",
    "    # Group by distance with continuity consideration\n",
    "    groups = []\n",
    "    current_group = []\n",
    "    current_min_distance = float(\"inf\")\n",
    "\n",
    "    for i in range(len(sorted_indices)):\n",
    "        index = sorted_indices[i]\n",
    "        distance = indices_dict[index]\n",
    "\n",
    "        if not current_group:  # Start a new group\n",
    "            current_group.append(index)\n",
    "            current_min_distance = distance\n",
    "        else:\n",
    "            if index == current_group[-1] + 1:  # Check continuity\n",
    "                current_group.append(index)\n",
    "                current_min_distance = min(current_min_distance, distance)\n",
    "            else:  # Save the current group and start a new one\n",
    "                groups.append((current_min_distance, current_group))\n",
    "                current_group = [index]\n",
    "                current_min_distance = distance\n",
    "\n",
    "    if current_group:  # add the last group\n",
    "        groups.append((current_min_distance, current_group))\n",
    "\n",
    "    groups.sort(key=lambda x: x[0])  # Sort groups by minimum distance\n",
    "\n",
    "    return {i: group for i, (dist, group) in enumerate(groups)}\n",
    "\n",
    "\n",
    "# group_and_rerank_chunks({\n",
    "#         50:75, 51:75, 52:75, 53:75, 54:75, 55:75,\n",
    "#         997:1, 998:1, 999:1,\n",
    "#         5:50, 6:50, 7:50,\n",
    "#         1:100, 2:100, 3:100,\n",
    "#         8:100, 9:1000, 10:1000}) # Test this one to understand\n",
    "\n",
    "\n",
    "reranked_indices = group_and_rerank_chunks(similar_chunk_indices)\n",
    "reranked_indices  # Look at the group for 16. Even though it has more disatnce than 86 but since it's part of a continuous group, we put it before"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 609
    },
    "id": "qllz1SACwDI1",
    "outputId": "0f2cd94c-d696-4f3a-8a00-3a162845be95"
   },
   "outputs": [
    {
     "data": {
      "application/vnd.google.colaboratory.intrinsic+json": {
       "summary": "{\n  \"name\": \"similar_results\",\n  \"rows\": 9,\n  \"fields\": [\n    {\n      \"column\": \"chunk_index\",\n      \"properties\": {\n        \"dtype\": \"number\",\n        \"std\": 18,\n        \"min\": 7,\n        \"max\": 50,\n        \"num_unique_values\": 9,\n        \"samples\": [\n          8,\n          16,\n          50\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    },\n    {\n      \"column\": \"text\",\n      \"properties\": {\n        \"dtype\": \"string\",\n        \"num_unique_values\": 9,\n        \"samples\": [\n          \"which a centralised power exercised sovereign control over a clearly\\ndefined territory, had been developing over a long period of time\\nin Europe. But a nation-state was one in which the majority of its\\ncitizens, and not only its rulers, came to develop a sense of common\\nidentity and shared history or descent. This commonness did not\\nexist from time immemorial; it was forged through struggles, through\\nthe actions of leaders and the common people. This chapter will\",\n          \"the former royal standard. The Estates General  was elected by the\\nbody of active citizens and renamed the National Assembly. New\\nhymns were composed, oaths taken and martyrs commemorated,\\nall in the name of the nation. A centralised administrative system\\nwas put in place and it formulated uniform laws for all citizens\\nwithin its territory. Internal customs duties and dues were abolished\\nand a uniform system of weights and measures was adopted.\",\n          \"and serfdom could strengthen the autocratic monarchies of Europe.\\nIn 1815, representatives of the European powers \\u2013 Britain, Russia,\\nPrussia and Austria \\u2013 who had collectively defeated Napoleon, met\\nat Vienna to draw up a settlement for Europe. The Congress was\\nhosted by the Austrian Chancellor Duke Metternich. The delegates\\nEconomists began to think in terms of the national\\neconomy. 
They talked of how the nation could\\ndevelop and what economic measures could help\\nforge this nation together.\"\n        ],\n        \"semantic_type\": \"\",\n        \"description\": \"\"\n      }\n    }\n  ]\n}",
       "type": "dataframe"
      },
      "text/html": [
       "\n",
       "  <div id=\"df-7ae8427b-3434-4a53-810b-59d1fcf3e946\" class=\"colab-df-container\">\n",
       "    <div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>chunk_index</th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>15</td>\n",
       "      <td>From the very beginning, the French revolutionaries\\nintroduced various measures and practices that\\ncould create a sense of collective identity amongst\\nthe French people. The ideas of la patrie (the\\nfatherland) and le citoyen (the citizen) emphasised\\nthe notion of a united community enjoying equal rights under a\\nconstitution. A new French flag, the tricolour,   was chosen to replace\\nthe former royal standard. The Estates General  was elected by the</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>16</td>\n",
       "      <td>the former royal standard. The Estates General  was elected by the\\nbody of active citizens and renamed the National Assembly. New\\nhymns were composed, oaths taken and martyrs commemorated,\\nall in the name of the nation. A centralised administrative system\\nwas put in place and it formulated uniform laws for all citizens\\nwithin its territory. Internal customs duties and dues were abolished\\nand a uniform system of weights and measures was adopted.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>17</td>\n",
       "      <td>within its territory. Internal customs duties and dues were abolished\\nand a uniform system of weights and measures was adopted.\\nRegional dialects were discouraged and French, as it was spoken\\nand written in Paris, became the common language of the nation.\\nThe revolutionaries further declared that it was the mission and the\\ndestiny of the French nation to liberate the peoples of Europe\\nfrom despotism, in other words to help other peoples of Europe\\nto become nations.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>48</td>\n",
       "      <td>economic nationalism strengthened the wider nationalist sentiments\\ngrowing at the time.\\n2.3 A New Conservatism after 1815\\nFollowing the defeat of Napoleon in 1815, European governments\\nwere driven by a spirit of conservatism. Conservatives believed\\nthat established, traditional institutions of state and society –  like the\\nmonarchy, the Church, social hierarchies, property and the family –\\nshould be preserved. Most conservatives, however, did not propose</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>49</td>\n",
       "      <td>should be preserved. Most conservatives, however, did not propose\\na return to the society of pre-revolutionary days. Rather, they realised,\\nfrom the changes initiated by Napoleon, that modernisation could\\nin fact strengthen traditional institutions like the monarchy. It could\\nmake  state  power more effective and strong. A modern army, an\\nefficient bureaucracy, a dynamic economy, the abolition of feudalism\\nand serfdom could strengthen the autocratic monarchies of Europe.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>50</td>\n",
       "      <td>and serfdom could strengthen the autocratic monarchies of Europe.\\nIn 1815, representatives of the European powers – Britain, Russia,\\nPrussia and Austria – who had collectively defeated Napoleon, met\\nat Vienna to draw up a settlement for Europe. The Congress was\\nhosted by the Austrian Chancellor Duke Metternich. The delegates\\nEconomists began to think in terms of the national\\neconomy. They talked of how the nation could\\ndevelop and what economic measures could help\\nforge this nation together.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>7</td>\n",
       "      <td>the world.\\nThis chapter will deal with many of the issues visualised by Sorrieu\\nin Fig. 1. During the nineteenth century, nationalism emerged as a\\nforce which brought about sweeping changes in the political and\\nmental world of Europe. The end result of these changes was the\\nemergence of the nation-state in place of the multi-national dynastic\\nempires of Europe. The concept and practices of a modern state, in\\nwhich a centralised power exercised sovereign control over a clearly</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>8</td>\n",
       "      <td>which a centralised power exercised sovereign control over a clearly\\ndefined territory, had been developing over a long period of time\\nin Europe. But a nation-state was one in which the majority of its\\ncitizens, and not only its rulers, came to develop a sense of common\\nidentity and shared history or descent. This commonness did not\\nexist from time immemorial; it was forged through struggles, through\\nthe actions of leaders and the common people. This chapter will</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>9</td>\n",
       "      <td>the actions of leaders and the common people. This chapter will\\nlook at the diverse processes through which nation-states and\\nnationalism came into being in nineteenth-century Europe.\\nErnst Renan, ‘What is a Nation?’\\nIn a lecture delivered at the University of\\nSorbonne in 1882, the French philosopher Ernst\\nRenan (1823-92) outlined his understanding of\\nwhat makes a nation. The lecture  was\\nsubsequently published as a famous essay entitled\\n‘Qu’est-ce qu’une nation?’ (‘What is a Nation?’).</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>\n",
       "    <div class=\"colab-df-buttons\">\n",
       "\n",
       "  <div class=\"colab-df-container\">\n",
       "    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-7ae8427b-3434-4a53-810b-59d1fcf3e946')\"\n",
       "            title=\"Convert this dataframe to an interactive table.\"\n",
       "            style=\"display:none;\">\n",
       "\n",
       "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n",
       "    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n",
       "  </svg>\n",
       "    </button>\n",
       "\n",
       "  <style>\n",
       "    .colab-df-container {\n",
       "      display:flex;\n",
       "      gap: 12px;\n",
       "    }\n",
       "\n",
       "    .colab-df-convert {\n",
       "      background-color: #E8F0FE;\n",
       "      border: none;\n",
       "      border-radius: 50%;\n",
       "      cursor: pointer;\n",
       "      display: none;\n",
       "      fill: #1967D2;\n",
       "      height: 32px;\n",
       "      padding: 0 0 0 0;\n",
       "      width: 32px;\n",
       "    }\n",
       "\n",
       "    .colab-df-convert:hover {\n",
       "      background-color: #E2EBFA;\n",
       "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
       "      fill: #174EA6;\n",
       "    }\n",
       "\n",
       "    .colab-df-buttons div {\n",
       "      margin-bottom: 4px;\n",
       "    }\n",
       "\n",
       "    [theme=dark] .colab-df-convert {\n",
       "      background-color: #3B4455;\n",
       "      fill: #D2E3FC;\n",
       "    }\n",
       "\n",
       "    [theme=dark] .colab-df-convert:hover {\n",
       "      background-color: #434B5C;\n",
       "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
       "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
       "      fill: #FFFFFF;\n",
       "    }\n",
       "  </style>\n",
       "\n",
       "    <script>\n",
       "      const buttonEl =\n",
       "        document.querySelector('#df-7ae8427b-3434-4a53-810b-59d1fcf3e946 button.colab-df-convert');\n",
       "      buttonEl.style.display =\n",
       "        google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
       "\n",
       "      async function convertToInteractive(key) {\n",
       "        const element = document.querySelector('#df-7ae8427b-3434-4a53-810b-59d1fcf3e946');\n",
       "        const dataTable =\n",
       "          await google.colab.kernel.invokeFunction('convertToInteractive',\n",
       "                                                    [key], {});\n",
       "        if (!dataTable) return;\n",
       "\n",
       "        const docLinkHtml = 'Like what you see? Visit the ' +\n",
       "          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
       "          + ' to learn more about interactive tables.';\n",
       "        element.innerHTML = '';\n",
       "        dataTable['output_type'] = 'display_data';\n",
       "        await google.colab.output.renderOutput(dataTable, element);\n",
       "        const docLink = document.createElement('div');\n",
       "        docLink.innerHTML = docLinkHtml;\n",
       "        element.appendChild(docLink);\n",
       "      }\n",
       "    </script>\n",
       "  </div>\n",
       "\n",
       "\n",
       "    <div id=\"df-efc88721-3910-44eb-96ed-13410174ac90\">\n",
       "      <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-efc88721-3910-44eb-96ed-13410174ac90')\"\n",
       "                title=\"Suggest charts\"\n",
       "                style=\"display:none;\">\n",
       "\n",
       "<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
       "     width=\"24px\">\n",
       "    <g>\n",
       "        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n",
       "    </g>\n",
       "</svg>\n",
       "      </button>\n",
       "\n",
       "<style>\n",
       "  .colab-df-quickchart {\n",
       "      --bg-color: #E8F0FE;\n",
       "      --fill-color: #1967D2;\n",
       "      --hover-bg-color: #E2EBFA;\n",
       "      --hover-fill-color: #174EA6;\n",
       "      --disabled-fill-color: #AAA;\n",
       "      --disabled-bg-color: #DDD;\n",
       "  }\n",
       "\n",
       "  [theme=dark] .colab-df-quickchart {\n",
       "      --bg-color: #3B4455;\n",
       "      --fill-color: #D2E3FC;\n",
       "      --hover-bg-color: #434B5C;\n",
       "      --hover-fill-color: #FFFFFF;\n",
       "      --disabled-bg-color: #3B4455;\n",
       "      --disabled-fill-color: #666;\n",
       "  }\n",
       "\n",
       "  .colab-df-quickchart {\n",
       "    background-color: var(--bg-color);\n",
       "    border: none;\n",
       "    border-radius: 50%;\n",
       "    cursor: pointer;\n",
       "    display: none;\n",
       "    fill: var(--fill-color);\n",
       "    height: 32px;\n",
       "    padding: 0;\n",
       "    width: 32px;\n",
       "  }\n",
       "\n",
       "  .colab-df-quickchart:hover {\n",
       "    background-color: var(--hover-bg-color);\n",
       "    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
       "    fill: var(--button-hover-fill-color);\n",
       "  }\n",
       "\n",
       "  .colab-df-quickchart-complete:disabled,\n",
       "  .colab-df-quickchart-complete:disabled:hover {\n",
       "    background-color: var(--disabled-bg-color);\n",
       "    fill: var(--disabled-fill-color);\n",
       "    box-shadow: none;\n",
       "  }\n",
       "\n",
       "  .colab-df-spinner {\n",
       "    border: 2px solid var(--fill-color);\n",
       "    border-color: transparent;\n",
       "    border-bottom-color: var(--fill-color);\n",
       "    animation:\n",
       "      spin 1s steps(1) infinite;\n",
       "  }\n",
       "\n",
       "  @keyframes spin {\n",
       "    0% {\n",
       "      border-color: transparent;\n",
       "      border-bottom-color: var(--fill-color);\n",
       "      border-left-color: var(--fill-color);\n",
       "    }\n",
       "    20% {\n",
       "      border-color: transparent;\n",
       "      border-left-color: var(--fill-color);\n",
       "      border-top-color: var(--fill-color);\n",
       "    }\n",
       "    30% {\n",
       "      border-color: transparent;\n",
       "      border-left-color: var(--fill-color);\n",
       "      border-top-color: var(--fill-color);\n",
       "      border-right-color: var(--fill-color);\n",
       "    }\n",
       "    40% {\n",
       "      border-color: transparent;\n",
       "      border-right-color: var(--fill-color);\n",
       "      border-top-color: var(--fill-color);\n",
       "    }\n",
       "    60% {\n",
       "      border-color: transparent;\n",
       "      border-right-color: var(--fill-color);\n",
       "    }\n",
       "    80% {\n",
       "      border-color: transparent;\n",
       "      border-right-color: var(--fill-color);\n",
       "      border-bottom-color: var(--fill-color);\n",
       "    }\n",
       "    90% {\n",
       "      border-color: transparent;\n",
       "      border-bottom-color: var(--fill-color);\n",
       "    }\n",
       "  }\n",
       "</style>\n",
       "\n",
       "      <script>\n",
       "        async function quickchart(key) {\n",
       "          const quickchartButtonEl =\n",
       "            document.querySelector('#' + key + ' button');\n",
       "          quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n",
       "          quickchartButtonEl.classList.add('colab-df-spinner');\n",
       "          try {\n",
       "            const charts = await google.colab.kernel.invokeFunction(\n",
       "                'suggestCharts', [key], {});\n",
       "          } catch (error) {\n",
       "            console.error('Error during call to suggestCharts:', error);\n",
       "          }\n",
       "          quickchartButtonEl.classList.remove('colab-df-spinner');\n",
       "          quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n",
       "        }\n",
       "        (() => {\n",
       "          let quickchartButtonEl =\n",
       "            document.querySelector('#df-efc88721-3910-44eb-96ed-13410174ac90 button');\n",
       "          quickchartButtonEl.style.display =\n",
       "            google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
       "        })();\n",
       "      </script>\n",
       "    </div>\n",
       "\n",
       "    </div>\n",
       "  </div>\n"
      ],
      "text/plain": [
       "   chunk_index  \\\n",
       "0           15   \n",
       "1           16   \n",
       "2           17   \n",
       "3           48   \n",
       "4           49   \n",
       "5           50   \n",
       "6            7   \n",
       "7            8   \n",
       "8            9   \n",
       "\n",
       "                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       text  \n",
       "0                                                From the very beginning, the French revolutionaries\\nintroduced various measures and practices that\\ncould create a sense of collective identity amongst\\nthe French people. The ideas of la patrie (the\\nfatherland) and le citoyen (the citizen) emphasised\\nthe notion of a united community enjoying equal rights under a\\nconstitution. A new French flag, the tricolour,   was chosen to replace\\nthe former royal standard. The Estates General  was elected by the  \n",
       "1                                                    the former royal standard. The Estates General  was elected by the\\nbody of active citizens and renamed the National Assembly. New\\nhymns were composed, oaths taken and martyrs commemorated,\\nall in the name of the nation. A centralised administrative system\\nwas put in place and it formulated uniform laws for all citizens\\nwithin its territory. Internal customs duties and dues were abolished\\nand a uniform system of weights and measures was adopted.  \n",
       "2                              within its territory. Internal customs duties and dues were abolished\\nand a uniform system of weights and measures was adopted.\\nRegional dialects were discouraged and French, as it was spoken\\nand written in Paris, became the common language of the nation.\\nThe revolutionaries further declared that it was the mission and the\\ndestiny of the French nation to liberate the peoples of Europe\\nfrom despotism, in other words to help other peoples of Europe\\nto become nations.  \n",
       "3                                         economic nationalism strengthened the wider nationalist sentiments\\ngrowing at the time.\\n2.3 A New Conservatism after 1815\\nFollowing the defeat of Napoleon in 1815, European governments\\nwere driven by a spirit of conservatism. Conservatives believed\\nthat established, traditional institutions of state and society –  like the\\nmonarchy, the Church, social hierarchies, property and the family –\\nshould be preserved. Most conservatives, however, did not propose  \n",
       "4                         should be preserved. Most conservatives, however, did not propose\\na return to the society of pre-revolutionary days. Rather, they realised,\\nfrom the changes initiated by Napoleon, that modernisation could\\nin fact strengthen traditional institutions like the monarchy. It could\\nmake  state  power more effective and strong. A modern army, an\\nefficient bureaucracy, a dynamic economy, the abolition of feudalism\\nand serfdom could strengthen the autocratic monarchies of Europe.  \n",
       "5  and serfdom could strengthen the autocratic monarchies of Europe.\\nIn 1815, representatives of the European powers – Britain, Russia,\\nPrussia and Austria – who had collectively defeated Napoleon, met\\nat Vienna to draw up a settlement for Europe. The Congress was\\nhosted by the Austrian Chancellor Duke Metternich. The delegates\\nEconomists began to think in terms of the national\\neconomy. They talked of how the nation could\\ndevelop and what economic measures could help\\nforge this nation together.  \n",
       "6                   the world.\\nThis chapter will deal with many of the issues visualised by Sorrieu\\nin Fig. 1. During the nineteenth century, nationalism emerged as a\\nforce which brought about sweeping changes in the political and\\nmental world of Europe. The end result of these changes was the\\nemergence of the nation-state in place of the multi-national dynastic\\nempires of Europe. The concept and practices of a modern state, in\\nwhich a centralised power exercised sovereign control over a clearly  \n",
       "7                                 which a centralised power exercised sovereign control over a clearly\\ndefined territory, had been developing over a long period of time\\nin Europe. But a nation-state was one in which the majority of its\\ncitizens, and not only its rulers, came to develop a sense of common\\nidentity and shared history or descent. This commonness did not\\nexist from time immemorial; it was forged through struggles, through\\nthe actions of leaders and the common people. This chapter will  \n",
       "8      the actions of leaders and the common people. This chapter will\\nlook at the diverse processes through which nation-states and\\nnationalism came into being in nineteenth-century Europe.\\nErnst Renan, ‘What is a Nation?’\\nIn a lecture delivered at the University of\\nSorbonne in 1882, the French philosopher Ernst\\nRenan (1823-92) outlined his understanding of\\nwhat makes a nation. The lecture  was\\nsubsequently published as a famous essay entitled\\n‘Qu’est-ce qu’une nation?’ (‘What is a Nation?’).  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "indices_to_search = []\n",
    "for priority, indices in reranked_indices.items():\n",
    "    indices_to_search.extend(indices)\n",
    "\n",
    "similar_results = (\n",
    "    table.search()\n",
    "    .where(f\"chunk_index IN {tuple(indices_to_search)}\")\n",
    "    .limit(len(similar_chunk_indices))\n",
    "    .to_pandas()\n",
    "    .set_index(\"chunk_index\")\n",
    "    .loc[indices_to_search, :]\n",
    "    .reset_index()\n",
    ")  # Just a trick to sort the DF according to the chunk priority group\n",
    "\n",
    "\n",
    "similar_results.drop(\"vector\", axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "vTwc_u2Jz_2G"
   },
   "source": [
    "Now we simply go group by group and, from the second entry of each group onwards, remove the prefix that overlaps with the previous chunk."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "m1EKuj1_wOKW",
    "outputId": "ce7459e5-626f-43f3-9486-ee55b48aa57b"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "## Context - 1:\n",
      "From the very beginning, the French revolutionaries\n",
      "introduced various measures and practices that\n",
      "could create a sense of collective identity amongst\n",
      "the French people. The ideas of la patrie (the\n",
      "fatherland) and le citoyen (the citizen) emphasised\n",
      "the notion of a united community enjoying equal rights under a\n",
      "constitution. A new French flag, the tricolour,   was chosen to replace\n",
      "the former royal standard. The Estates General  was elected by thew\n",
      "hymns were composed, oaths taken and martyrs commemorated,\n",
      "all in the name of the nation. A centralised administrative system\n",
      "was put in place and it formulated uniform laws for all citizens\n",
      "within its territory. Internal customs duties and dues were abolished\n",
      "and a uniform system of weights and measures was adopted.Regional dialects were discouraged and French, as it was spoken\n",
      "and written in Paris, became the common language of the nation.\n",
      "The revolutionaries further declared that it was the mission and the\n",
      "destiny of the French nation to liberate the peoples of Europe\n",
      "from despotism, in other words to help other peoples of Europe\n",
      "to become nations.\n",
      "\n",
      "## Context - 2:\n",
      "economic nationalism strengthened the wider nationalist sentiments\n",
      "growing at the time.\n",
      "2.3 A New Conservatism after 1815\n",
      "Following the defeat of Napoleon in 1815, European governments\n",
      "were driven by a spirit of conservatism. Conservatives believed\n",
      "that established, traditional institutions of state and society –  like the\n",
      "monarchy, the Church, social hierarchies, property and the family –\n",
      "should be preserved. Most conservatives, however, did not proposey realised,\n",
      "from the changes initiated by Napoleon, that modernisation could\n",
      "in fact strengthen traditional institutions like the monarchy. It could\n",
      "make  state  power more effective and strong. A modern army, an\n",
      "efficient bureaucracy, a dynamic economy, the abolition of feudalism\n",
      "and serfdom could strengthen the autocratic monarchies of Europe.sia,\n",
      "Prussia and Austria – who had collectively defeated Napoleon, met\n",
      "at Vienna to draw up a settlement for Europe. The Congress was\n",
      "hosted by the Austrian Chancellor Duke Metternich. The delegates\n",
      "Economists began to think in terms of the national\n",
      "economy. They talked of how the nation could\n",
      "develop and what economic measures could help\n",
      "forge this nation together.\n",
      "\n",
      "## Context - 3:\n",
      "the world.\n",
      "This chapter will deal with many of the issues visualised by Sorrieu\n",
      "in Fig. 1. During the nineteenth century, nationalism emerged as a\n",
      "force which brought about sweeping changes in the political and\n",
      "mental world of Europe. The end result of these changes was the\n",
      "emergence of the nation-state in place of the multi-national dynastic\n",
      "empires of Europe. The concept and practices of a modern state, in\n",
      "which a centralised power exercised sovereign control over a clearlyf time\n",
      "in Europe. But a nation-state was one in which the majority of its\n",
      "citizens, and not only its rulers, came to develop a sense of common\n",
      "identity and shared history or descent. This commonness did not\n",
      "exist from time immemorial; it was forged through struggles, through\n",
      "the actions of leaders and the common people. This chapter willtionalism came into being in nineteenth-century Europe.\n",
      "Ernst Renan, ‘What is a Nation?’\n",
      "In a lecture delivered at the University of\n",
      "Sorbonne in 1882, the French philosopher Ernst\n",
      "Renan (1823-92) outlined his understanding of\n",
      "what makes a nation. The lecture  was\n",
      "subsequently published as a famous essay entitled\n",
      "‘Qu’est-ce qu’une nation?’ (‘What is a Nation?’).\n"
     ]
    }
   ],
   "source": [
    "final_rag_text = \"## Context - 1:\\n\"\n",
    "\n",
    "group_priority = 0  # Priority of the Chunk group\n",
    "grouped_indices = reranked_indices[group_priority]\n",
    "remove_overlap = False  # from the 2nd element in the group, remove prefix overlap\n",
    "\n",
    "\n",
    "for _, row in similar_results.iterrows():\n",
    "    chunk_index = row[\"chunk_index\"]\n",
    "\n",
    "    if remove_overlap:  # if the previous chunk is there, remove the overlap\n",
    "        final_rag_text += row[\"text\"][CHUNK_OVERLAP:]\n",
    "    else:\n",
    "        final_rag_text += row[\"text\"]\n",
    "\n",
    "    remove_overlap = True\n",
    "\n",
    "    if (\n",
    "        chunk_index == grouped_indices[-1]\n",
    "    ):  # last element of the group means the new group has started\n",
    "        group_priority += 1\n",
    "        remove_overlap = False  # new group has started so don't trim the first element\n",
    "\n",
    "        if group_priority in reranked_indices:  # If not the last key in the dict\n",
    "            final_rag_text += f\"\\n\\n## Context - {group_priority+1}:\\n\"\n",
    "            grouped_indices = reranked_indices[group_priority]\n",
    "\n",
    "\n",
    "print(final_rag_text)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "ur6JtnVZ0MNb"
   },
   "source": [
    "## Voilà! We have enriched the context of chunks and utilized them in RAG."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "id": "Z-tTuJ5jcPwa"
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
